library(dplyr)
library(readr)
library(ggplot2)
library(openxlsx)
library(knitr)
library(tibble)
library(stringr)
library(stringi)
library(readxl)
library(lubridate)
library(shiny)
library(plotly)
Loading the Data and Removal of Training Data
# Unzip and extract ODK data from ODK zip
df <- as.data.frame(extract_data_from_odk_zip(params$file_path_zip, params$file_name_csv))
# Formatting dates from integer (in ms) to time stamp
df$start <- format_date_ms(df$start)
df$end <- format_date_ms(df$end)
# filtering for events that occurred after 18th July 21
df <- subset(df, as.Date(start) > as.Date("18.07.2021", "%d.%m.%Y"))
Deriving New Features
Time Spent per Event
# subtracting end from start date
df$time_spent = round(as.numeric(df$end - df$start))
Question
# splitting the node strings so that only the question name remains
df$question = sapply(df$node, create_question)
Question Decoded
df <- decode_question(df, df$question, params$codebook)
Categorical Answers Decoded
df <- decode_categories(df, params$codebook)
Time until a Response was Changed + Stream of Answer Changes
df <- df %>%
# bringing the data in the right order
arrange(`instance ID`, node, start) %>%
# adding two empty columns to store the new features in
add_column(time_till_change=NA) %>%
add_column(changed_from=NA)
# iterating over the df and computing the time it took until an answer was changed + adding what the question was before
for (i in 1:nrow(df)){
if (df$`old-value`[i]==df$`new-value`[i-1] && !is.na(df$`old-value`[i]) && !is.na(df$`new-value`[i-1]) ){
df$time_till_change[i] <- round(as.numeric(df$start[i]-df$end[i-1]))
} else{
next
}
}
Preview and Summary of the Final Data
head(df)
| uuid:04bcbdca-b5b6-4189-992a-5ae3765c5441 |
group questions |
/data/child_identification |
2021-08-17 12:54:01 |
2021-08-17 12:55:20 |
NA |
NA |
NA |
NA |
NA |
79 |
child_identification |
NA |
NA |
NA |
NA |
NA |
| uuid:04bcbdca-b5b6-4189-992a-5ae3765c5441 |
group questions |
/data/child_identification |
2021-08-17 22:14:47 |
2021-08-17 22:14:49 |
NA |
NA |
NA |
NA |
NA |
2 |
child_identification |
NA |
NA |
NA |
NA |
NA |
| uuid:04bcbdca-b5b6-4189-992a-5ae3765c5441 |
group questions |
/data/child_identification |
2021-08-17 22:15:37 |
2021-08-17 22:15:39 |
NA |
NA |
NA |
NA |
NA |
2 |
child_identification |
NA |
NA |
NA |
NA |
NA |
| uuid:04bcbdca-b5b6-4189-992a-5ae3765c5441 |
question |
/data/child_identification/a1_a_4a |
2021-08-17 12:54:01 |
2021-08-17 12:55:20 |
NA |
NA |
NA |
NA |
T-F0010-P0193 |
79 |
a1_a_4a |
If QR code scanning is not possible, please manually enter the participant identification code |
T-F0010-P0193 |
NA |
NA |
NA |
| uuid:04bcbdca-b5b6-4189-992a-5ae3765c5441 |
group questions |
/data/front_page |
2021-08-17 08:32:03 |
2021-08-17 08:32:05 |
NA |
NA |
NA |
NA |
NA |
2 |
front_page |
NA |
NA |
NA |
NA |
NA |
| uuid:04bcbdca-b5b6-4189-992a-5ae3765c5441 |
group questions |
/data/front_page |
2021-08-18 07:55:33 |
2021-08-18 07:55:34 |
NA |
NA |
NA |
NA |
NA |
2 |
front_page |
NA |
NA |
NA |
NA |
NA |
summary(df)
## instance ID event node start
## Length:2599 Length:2599 Length:2599 Min. :2021-07-31 13:17:06
## Class :character Class :character Class :character 1st Qu.:2021-08-18 09:47:06
## Mode :character Mode :character Mode :character Median :2021-08-19 08:55:06
## Mean :2021-08-17 12:17:01
## 3rd Qu.:2021-08-20 10:05:23
## Max. :2021-08-22 09:32:53
##
## end latitude longitude accuracy
## Min. :2021-07-31 13:21:20 Mode:logical Mode:logical Mode:logical
## 1st Qu.:2021-08-18 09:25:01 NA's:2599 NA's:2599 NA's:2599
## Median :2021-08-19 08:54:47
## Mean :2021-08-17 08:12:00
## 3rd Qu.:2021-08-20 09:34:54
## Max. :2021-08-22 09:32:48
## NA's :565
## old-value new-value time_spent question
## Length:2599 Length:2599 Min. : 0.0 Length:2599
## Class :character Class :character 1st Qu.: 6.0 Class :character
## Mode :character Mode :character Median : 37.0 Mode :character
## Mean : 393.0
## 3rd Qu.: 185.8
## Max. :109257.0
## NA's :565
## question_decoded new_value_decoded old_value_decoded time_till_change
## Length:2599 Length:2599 Length:2599 Min. : 0.000
## Class :character Class :character Class :character 1st Qu.: 2.000
## Mode :character Mode :character Mode :character Median : 6.000
## Mean : 9.938
## 3rd Qu.:12.000
## Max. :54.000
## NA's :2534
## changed_from
## Mode:logical
## NA's:2599
##
##
##
##
##
Grouped by Time
Events/Questions Started by Day
df_by_day <- df %>%
mutate(start_date = as.Date(start)) %>%
count(start_date, name = "count")
gg1 <- ggplot(df_by_day, aes(x = start_date, y = count)) +
geom_line() +
geom_smooth(alpha=0.5, colour="red", method="loess", se=F) +
labs(title = "Number of Events/Questions Started by Day with Smoothed Regression Line", y = "Number of Questions/Events Started", x = "Satrt Date") +
theme_light()
gg1

Questions/Events started by Weekday and Hour of the Day
df_wday_hour <- df %>%
mutate(wday=wday(start, label=T, week_start = 1), hour=hour(start)) %>%
count(wday, hour, name="count_wday_hour") %>%
arrange(desc(wday))
theme_heatmap <- theme_light() +
theme(panel.grid = element_blank(),
panel.border = element_blank(),
plot.title = element_text(face = "bold", size = 11, hjust = 0.5),
axis.ticks = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_text(size=10),
axis.text.y = element_text(size = 8),
axis.text.x = element_text(size = 10),
legend.position = "none")
gg2 <- ggplot(df_wday_hour, aes(x = wday, y = hour, fill = count_wday_hour)) +
geom_tile(colour="white") +
scale_fill_gradient(low = "#fff0f0", high="#940606") +
scale_y_reverse(breaks=c(23:0), labels=c(23:0), expand = c(0,0)) +
scale_x_discrete(expand = c(0,0), position = "top") +
labs(title = "Number of Started Events/Questions by Day of Week / Hour of Day", y = "Hour of Day") +
geom_text(aes(label = count_wday_hour), size = 2) +
theme_heatmap
gg2

Distribution of Time Spent per Event/Question with largest 5 % removed
df_clean = subset(df, time_spent<quantile(df$time_spent,0.95, na.rm=TRUE))
hist(df_clean$time_spent[!is.na(df_clean$time_spent)]/60, breaks=20, xlab = "Time Spent in Minutes", main = "Histogram of the Time Spent by Question")

Aggregated by Event/Question
Count of Old-New Value Pairs
df_stream <- df %>%
filter(!is.na(time_till_change)) %>%
count(question_decoded,
old_value_decoded,
new_value_decoded,
name="count_value_pairs",
sort=TRUE) %>%
filter(count_value_pairs > 1)
df_stream
| step_end |
The child is exiting the facility |
The child is moving to the next step |
6 |
| step_type |
other |
walking |
3 |
| step_type |
other |
waiting |
2 |
| step_type |
waiting |
laboratory testing |
2 |
| step_type |
waiting |
walking |
2 |
| step_type |
walking |
other |
2 |
Aggregated by Instance
Top 10 % of Duration by Instance
df_duration_per_inst <- df %>%
group_by(`instance ID`) %>%
summarise(duration_per_inst = max(end, na.rm=T) - min(start, na.rm=T)) %>%
filter(duration_per_inst>quantile(duration_per_inst, 0.9, na.rm=TRUE)) %>%
mutate(duration_per_inst = round(seconds_to_period(duration_per_inst))) %>%
arrange(desc(duration_per_inst))
df_duration_per_inst
| uuid:40ecef80-d668-466a-8b6d-b1b3342af214 |
8d 14H 3M 16S |
| uuid:260dca23-57f3-43b8-8ee6-411783c89c45 |
2d 21H 36M 8S |
| uuid:63ba1773-d340-4ea0-bddb-03be8b8587dd |
2d 12H 58M 6S |
| uuid:e3c416ae-1d97-4b44-8a82-0deb658a76c0 |
2d 8H 51M 45S |
| uuid:696d589b-0ed3-4c70-81d3-aee201655262 |
2d 1H 42M 35S |
| uuid:16b98e4f-ac5c-4b88-8de3-13f0c67e49f1 |
1d 22H 29M 12S |
Distribution of Duration by Instance with Top 10 % excluded
df_subsetted <- df %>%
group_by(`instance ID`) %>%
summarise(duration_per_inst = max(end, na.rm=T) - min(start, na.rm=T)) %>%
filter(duration_per_inst<quantile(duration_per_inst, 0.9, na.rm=TRUE))
hist(as.numeric(df_subsetted$duration_per_inst/60), breaks=30, main="Duration per Instance in Minutes (outliers removed)", xlab="Duration in Minutes")

Irregularities and Outliers
Time Till Change Outliers (for all data without removed outliers)
df_time_till_change_outliers <- df %>%
filter(time_till_change>quantile(df$time_till_change, 0.9, na.rm=TRUE)) %>%
arrange(desc(time_till_change)) %>%
mutate(time_till_change = round(seconds_to_period(time_till_change))) %>%
select(`instance ID`,
question_decoded,
old_value_decoded,
new_value_decoded,
time_till_change)
df_time_till_change_outliers
| uuid:75276ac5-ad78-41a5-84de-f78777ae34f8 |
comments |
Clinical officer delay to start registration and consultation because there was a meeting in consultation room. |
|
|
| Caregiver and patient return back to pharmacy for the second time because the nurse attendant did not understand properly what was written in prescription, therefore the nurse attendant request caregiver to return it to clinical officer to write it properly and return back. |
Caregiver and patient waited for a long time to enter in consultation room for registration because there was a meeting in the consultation room . |
|
|
|
Clinical officer delayed to start registration because she had a meeting with visitors and staffs. Both registration and consultation conducted in the same room (consultation room). Caregiver and patient return back to pharmacy for the second time because the nurse attendant did not understand properly what was written in prescription, therefore the nurse attendant requested caregiver to return it to clinical officer to write it properly and return it to pharmacy. Clinical officer did not use pulse oximeter. Clinical officer did not use IMCI guideline. | 54S| |uuid:04bcbdca-b5b6-4189-992a-5ae3765c5441 |comments |The caregiver had 3 sick children, only 1 eligible for enrollment in TIMCI Registration service window opened at 0915hrs Time flow recorded after enrollment in TIMCI by RA Tb screening done because it’s the first time the patient is treated in the facility While waiting for 2nd consultation the clinician wasn’t in the consultation room until 1210hrs |The caregiver had 3 sick children, only 1 eligible for enrollment in TIMCI Registration service window opened at 0915hrs Time flow recorded after enrollment in TIMCI by RA Tb screening done because it’s the first time the patient is treated in the facility While waiting for 2nd consultation the clinician wasn’t in the consultation room until 1210hrs While waiting at the dispensing area there was no HCP until 1350hrs | 52S| |uuid:b77a9a81-74b2-406c-851f-b18c993e47e4 |comments |The patient moved from one step to another without having to wait for long because the were few patients until when the patient got to dispensing area where the medical attendant was out so they had to wait for sometime before being attended to |The patient moved from one step to another without having to wait for long because the were few patients until when the patient got to dispensing area where the medical attendant was out so they had to wait for sometime before being attended to They was also deviation because instead going to the dispensing area they went to timci research assistant for further documentation.Thus explaining double walking in steps 12 and 13 | 41S| |uuid:63ba1773-d340-4ea0-bddb-03be8b8587dd |comments |Dispensary crowded since it doesn’t open during the weekends CO not around due to administrative activities Consultation was done by EN instead of CO The EN started providing services at the ANC, followed by dressing room, then Consultation. Priority given to elderly patients Lab tests (MRDT and PITC) done in consultation room Dispensing of meds done by MA. |Dispensary crowded since it doesn’t open during the weekends CO not around due to administrative activities Consultation was done by EN instead of CO The EN started providing services at the ANC, followed by dressing room, then Consultation. Priority given to elderly patients Registration done in consultation room Lab tests (MRDT and PITC) done in consultation room Dispensing of meds done by MA. | 39S| |uuid:696d589b-0ed3-4c70-81d3-aee201655262 |comments |Due to the long waiting line the TIMCI RA checked if caregiver/patient are eligible for enrollment, verbal consent obtained for shadowing obtained - TIMCI enrollment done in co |-Due to the long waiting line the TIMCI RA checked if caregiver/patient are eligible for enrollment, verbal consent for shadowing obtained. - TIMCI enrollment done in the 5th step: consultation waiting area From 0929hrs to 0943hrs -In the 5th step there was no clinician in the consultation room until 1035hrs -Lab waiting area crowded, lab opened late, there were also lab referrals from community dispensaries - | 39S| |uuid:696d589b-0ed3-4c70-81d3-aee201655262 |comments |-Due to the long waiting line the TIMCI RA checked if caregiver/patient are eligible for enrollment, verbal consent for shadowing obtained. - TIMCI enrollment done in the 5th step: consultation waiting area From 0929hrs to 0943hrs -In the 5th step there was no clinician in the consultation room until 1035hrs -Lab waiting area crowded, lab opened late, there were also lab referrals from community dispensaries - |-Due to the long waiting line the TIMCI RA checked if caregiver/patient are eligible for enrollment, verbal consent for shadowing obtained. - TIMCI enrollment done in the 5th step: consultation waiting area From 0929hrs to 0943hrs -In the 5th step there was no clinician in the consultation room until 1035hrs -Lab waiting area crowded, lab opened late, there were also lab referrals from community dispensaries - In the 8th step: Lab waiting area, there was a HCP explaining on how to collect stool sample. | 31S| |uuid:887c48e5-4325-4c30-80bf-d2139b476991 |step_type |walking |other | 21S|
Histograms of Instances with Inconsistent Filling Behaviour
irregular_inst = c()
for (id in unique(df$`instance ID`)){
bin_vec = cut(df$start[df$`instance ID`==id],
breaks=10,
labels=F)
if (length(unique(bin_vec)) < 5) irregular_inst = c(irregular_inst, id)
}
paste0(length(irregular_inst), " out of ", length(unique(df$`instance ID`))," instances were found to have an inconsistent filling behaviour.")
## [1] "29 out of 53 instances were found to have an inconsistent filling behaviour."
last_bin_questions = c()
fig <- plot_ly(alpha=0.1)
for (id in irregular_inst){
temp_df = df[df$`instance ID`==id,]
temp_df$cut = cut(temp_df$start, breaks=10, labels=c("1. Part", "2. Part", "3. Part", "4. Part", "5. Part", "6. Part", "7. Part", "8. Part", "9. Part", "10. Part"))
fig <- fig %>% add_histogram(x=temp_df$cut, name=id)
last_bin_questions = c(last_bin_questions, temp_df$question_decoded[temp_df$cut=="10. Part"])
}
fig <- fig %>% layout(barmode = "overlay")
fig
kable(table(last_bin_questions) %>% as.data.frame() %>% arrange(desc(Freq)))
| comments |
16 |
| step_other |
8 |
| step_type |
6 |
| waiting_comment |
2 |
| If QR code scanning is not possible, please manually enter the participant identification code |
1 |